今天寫個短短的~
談一下建立模型的部分
今天先大致訓練一個模型,之後我們再花幾個篇幅慢慢調整模型的問題。畢竟先求有再求好嘛!
# 引入必要的套件
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
# 1. Load the labelled dataset.
data = pd.read_csv('label_data_with_stoch.csv')

# 2. Build lag features: for every base column, add its value from 1 to 5
#    rows earlier as a new column named '<column>_lag_<n>'.
columns_to_use = ['open', 'high', 'low', 'close', 'volume', 'slowk_5_3_3', 'slowd_5_3_3']
for col in columns_to_use:
    for i in range(1, 6):
        data[f'{col}_lag_{i}'] = data[col].shift(i)
# The shifts leave NaN in the leading rows; drop every row that contains a NaN.
data = data.dropna()

# 3. Pre-processing: split into training (80%) and test (20%) sets.
# NOTE(review): the lag features suggest the rows form a time series. A random
# split mixes earlier and later rows between train and test, so the test
# accuracy is probably optimistic. Consider a chronological split instead --
# confirm the row order of the CSV first.
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# Only the lagged columns are used as features; 'label' is the target.
features = [col for col in data.columns if '_lag_' in col]
X_train = train_data[features]
y_train = train_data['label']
X_test = test_data[features]
y_test = test_data['label']

# 4. Fit a random forest with default hyper-parameters (fixed seed for
#    reproducibility).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

# Compute and report accuracy (the scores were previously computed but never shown).
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Train accuracy: {train_accuracy:.4f}')
print(f'Test accuracy: {test_accuracy:.4f}')

# 5. Persist the fitted model for later prediction.
joblib.dump(rf_model, 'random_forest_model.pkl')
然後我們再做一個預測用的函式。
import pandas as pd
import joblib
def predict(input_data, model_path='random_forest_model.pkl'):
    """Load the persisted model and return its predictions for *input_data*.

    Args:
        input_data: Feature matrix accepted by the model's ``predict``. When
            it is a pandas DataFrame that contains every column the model was
            fitted on, those columns are selected in the fitted order first,
            so extra or re-ordered columns do not trigger a feature-name
            mismatch.
        model_path: Path of the joblib file to load. Defaults to the file
            written by the training script.

    Returns:
        Whatever the loaded model's ``predict`` returns for the input.
    """
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(model_path)

    # Align DataFrame input with the feature names recorded at fit time.
    # If the attribute is absent or columns are missing, pass the input
    # through unchanged so the model raises its own error as before.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None and isinstance(input_data, pd.DataFrame):
        expected = list(expected)
        if all(name in input_data.columns for name in expected):
            input_data = input_data[expected]

    predictions = model.predict(input_data)
    return predictions
明天我們就可以來做回測分析~(今天水一波~~)